data gathered from https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases
Here I am doing a few things:
I made a row for each status, including a new active status
I got rid of all columns except Country, Status, Value, Date
I added a new region called “Global”, this holds all the values for every country combined
options(stringsAsFactors=FALSE)
library(plyr)
library(dplyr)
library(reshape2)
library(ggplot2)
library(plotly)
library(highcharter)
#function for grabbing status from my filenames
getStatus<- function(x){
strsplit(x, "-")[[1]] %>%
last() %>%
gsub(pattern="\\.csv", replacement="")
}
#function created for adding an active status
createActive <- function(x){
dcast(Country.Region + Date ~ Status,
data=x, value.var="Value") %>%
mutate(Active = Confirmed - (Deaths + Recovered)) %>%
melt(id = c("Country.Region", "Date"))
}
#function used to convert a dataset from long to wide
convert <- function(x){ dcast(Country + Date ~ Status,
data=x, value.var="Value")}
###########################################
### This is where I start cleaning the data
files <- list.files("raw", full.names = TRUE)
raw <- files %>% #read in files
lapply(function(x){
read.csv(x) %>%
mutate(
Date = as.Date(Date, "%m/%d/%Y"),
Status = getStatus(x) #add status
)
}) %>%
bind_rows() %>% #combine files
subset(!(Value==0) )
raw <- raw %>% #combine countries into one
group_by(Country.Region, Date, Status) %>%
summarise(
Value = sum(Value)
)
raw <- raw %>% #get global stats
group_by(Date, Status) %>%
summarise(
Value = sum(Value)
) %>%
ungroup() %>%
mutate(
Country.Region = "Global"
) %>%
bind_rows(raw)
raw <- raw %>% #create active columns, delete nulls, rename
createActive() %>%
subset(!is.na(value)) %>%
rename(
Country = Country.Region,
Value = value,
Status = variable
)
total <- raw %>% #Get total values for seperate df
group_by(Country, Status) %>%
summarise(
Value = max(Value)
) %>%
ungroup() %>%
mutate(
Status = as.character(Status)
)
#get change per day
raw <- raw %>%
group_by(Country, Status) %>%
mutate(
Change = Value - lag(Value, k=1),
Rate_Change = (Value - lag(Value, k=1))/lag(Value, k=1)
) %>%
ungroup() %>%
mutate(
Status = as.character(Status)
)Now that the data is clean, lets start digging into the data!
plot_data <- total %>%
subset(Country %in% c("US", "China", "Italy", "Germany", "Spain")) %>%
mutate(
Cases = Value
)
p <- plot_data %>%
ggplot(aes(reorder(Status,Cases), Cases, fill = Country),
text = paste("Cases:", Cases))+
geom_bar(stat="identity")+ coord_flip()+ xlab("Status of Case")+ ylab("Total Cases")+ ggtitle("Number of Cases for top 5 countries")
p <- ggplotly(p)
pThe above graph shows the variance of cumulative cases by type, and by country. We can see some pretty interesting things here. For example, Italy has the highest deaths and active cases, yet they are only second in the number of confirmed cases. This could mean a few things:
Italy may not have been testing enough people, which would bring their Mortality Rate up.
Italys’ healthcare system is getting overrun, and their deaths are about to skyrocket.
total %>%
subset(Country %in% c("US", "China", "Italy", "Germany", "Spain")) %>%
subset(Status == "Deaths") %>%
hchart("treemap",
colorByPoint = TRUE,
hcaes(x = Country, value = Value)) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Total Deaths by Country",
align = "left")This graph perfectly shows just how many Deaths Italy has compared to our top 5 countries. At the time of writing this, Italy has just about doubled their death count over Chinas’!
For the remainder of this analysis, I will be focusing on Italy.
Theres a couple things to keep in mind when looking at the rate of change.
It has more variance in the beginning.
As the number of cases get larger, the rate will tend to even out, or even decline.
This is the basic Exponential growth function:
\(f(x) = P_0(1+r)^d\)
The rate is what gets exponentiated for an exponential function, ie, if our mean rate is higher, our confirmed cases will get exponentially higher with each day. Also, if the rate is negative that means our cases per day has dropped from the previous days’ number. If there starts to be a trend of negative rates, this means that the disease is starting to slow down, which could mean we are past the peak.
plot_data <- raw %>%
subset(Country == "Italy") %>%
subset(!(is.na(Rate_Change))) %>%
subset(Rate_Change <= 1)
p_c = plot_data %>%
subset(Status == "Confirmed")
p_d = plot_data %>%
subset(Status == "Deaths")
p_a = plot_data %>%
subset(Status == "Active")
hchart(density(p_a$Rate_Change), type = "area", color = "yellow", name = "Rate of Active") %>%
hc_add_series(density(p_c$Rate_Change), type = "area", name = "Rate of Confirmed", color = "green") %>%
hc_add_series(density(p_d$Rate_Change), type = "area", name = "Rate of Deaths", color = "red") %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Distribion of the Rate of Change, per Day",
align = "left") %>%
hc_subtitle(text = "For Italy",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}")) %>%
hc_xAxis(labels = list(format = "{value}"),
title = list(text = "Rate of Change"))%>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(){
return ('Rate of Change: ' + this.y)}"))This graph clearly shows that the deaths rate of change has not only a higher average, but most of the rates are higher than the rate of confirmed. This tells us the number of deaths are going up much faster than our confirmed cases are.
There are a multitude of reasons why this could happen:
The Confirmed cases are lagging behind, due to the incubation period.
The Confirmed cases are innacurate, as finding this true value is difficult.
Our Mortality rate is not static
Rates tend to even out over time, meaning in the beginning stages the rate of change will be much higher than later on.
raw %>%
subset(Country == "Italy") %>%
subset(!(is.na(Rate_Change))) %>%
subset(Status == "Deaths") %>%
hchart( type = "scatter",
hcaes(y = Rate_Change, x = Change),
color = "red",
regression = TRUE,
borderColor = '#EBBA95',
borderRadius = 10,
borderWidth = 2) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_add_dependency("plugins/highcharts-regression.js") %>%
hc_title(text = "Rate of Change vs Actual Change",
align = "left") %>%
hc_subtitle(text = "For Italys' Deaths",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"),
title = list(text = "Rate of change")) %>%
hc_xAxis(labels = list(format = "{value}"),
title = list(text = "Cases per Day")) %>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(){
return ('Date: ' + this.point.Date + ' <br> Rate of Change: ' + this.y + ' <br> Cases per Day: ' + this.x)}"))Looking at this scatter plot, as the number of Deceased cases rise, the Rate of Change trends downwards. This means that Italy has a negative correlation between the actual change in cases per day and the rate of change. This is a good thing, as it could mean a few things:
Lets find out how well correlated the two actually are, using pearsons’ method.
Before we start, I will get the outliers out of the data, as some of the early rates are innacurate, for the reasons described before.
corr <- raw %>% #subsetting our data
subset(Country == "Italy") %>%
subset(!(is.na(Rate_Change))) %>%
subset(!(is.na(Change))) %>%
subset(Status == "Deaths", select = c(Value, Change, Rate_Change))
rc_sd = sd(corr$Rate_Change) #standard deviation for rate of change
rc_mean = mean(corr$Rate_Change) #mean for rate of change
cd_sd = sd(corr$Change) #standard deviation for cases per day
cd_mean = mean(corr$Change) #mean for cases per day
corr <- corr %>% #removing outliers
subset(rc_mean-rc_sd*3 <= Rate_Change | Rate_Change <= rc_mean+rc_sd*3) %>%
subset(cd_mean-cd_sd*3 <= Change | Change <= cd_mean+cd_sd*3) %>% subset(Rate_Change < 1) #we have a few outliers that need to go
std_corr <- corr %>%
mutate(
Rate_Change = (Rate_Change - rc_mean)/rc_sd
) %>%
subset(Rate_Change <= 1) #we have a few outliers that need to goFirst, lets check to see if our Rate of Change is relatively normal.
##
## Shapiro-Wilk normality test
##
## data: std_corr$Rate_Change
## W = 0.94448, p-value = 0.1314
This tells us that our p value is 0.1314 > 0.05
This is great news! It means our data is normal, and this means that we can use our Rate of Deaths as a predictor for our model!
hchart(density(std_corr$Rate_Change), type = "area", color = "red", name = "Rate of Deaths") %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Distribion of the Rate of Change, per Day",
align = "left") %>%
hc_subtitle(text = "For Italy",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}")) %>%
hc_xAxis(labels = list(format = "{value}"),
title = list(text = "Rate of Change"))%>%
hc_legend(align = "left") %>%
hc_exporting(enabled = TRUE,
filename = "it-dist-roc") %>%
hc_tooltip(formatter = JS("function(){
return ('Rate of Change: ' + this.y)}"))corr %>%
cor() %>% #get correlation
round(3) %>% #round
hchart() %>% #graph
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Pearsons r matchup",
align = "left") %>%
hc_subtitle(text = "For Italys' Deceased Cases",
align = "left") %>%
hc_xAxis(categories = list("Cumulative Deaths", "Cases per Day", "Rate of Change")) %>%
hc_yAxis(categories = list("Cumulative Deaths", "Cases per Day", "Rate of Change"))%>%
hc_legend(align = "left") %>%
hc_plotOptions(
series = list(
boderWidth = 0,
dataLabels = list(enabled = TRUE)))As expected there is a negative correlation between the Rate of Change and Total cases, aswell as a negative correlation between our Rate of Change and the cases per day.
This makes sense as the amount of cases increase, our rate should start having less variations. This is also great news, as it means our rate is trending downwards, ultimately suggesting that the rate of Deaths per day is slowing down.
corr %>%
cor() %>% #get correlation
'^'(2) %>% #square numbers
round(3) %>% #round
hchart() %>% #graph
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Pearsons r squared matchup",
align = "left") %>%
hc_subtitle(text = "For Italys' Deceased Cases",
align = "left") %>%
hc_xAxis(categories = list("Cumulative Deaths", "Cases per Day", "Rate of Change")) %>%
hc_yAxis(categories = list("Cumulative Deaths", "Cases per Day", "Rate of Change"))%>%
hc_legend(align = "left") %>%
hc_plotOptions(
series = list(
boderWidth = 0,
dataLabels = list(enabled = TRUE)))The r squared value can tell us much more, for example, we know that 88.7% of our Cases per day can be explained by our Total cases! This also tells us that 29.4% of our Cumulative cases can be explained by the variations in the Rate of Change.
Let’s check if these values are statistically significant, to do that we need the below formula to get our test statistic.
\({ts} = \frac{r*\sqrt[2]{n-2}}{\sqrt[2]{1-r^2}}\)
df = nrow(corr)-2 #our degree of freedom
cv = qt(.975, df) #our test statistic
r <- corr %>% #correlating our data for r values
cor()
call_ts <- function(r, df){#function for getting the test statistic
ts= abs((r*(df)^.5)/(1-(r)^2)^.5) #formula
return (ts)
}
#changing our matrix to be the test statistics
ts <- call_ts(r, df)
ts## Value Change Rate_Change
## Value Inf 14.245784 3.355033
## Change 14.245784 Inf 2.206968
## Rate_Change 3.355033 2.206968 Inf
These are our test statistics for each r value, now we need to use the below formula to check if the r value is statistically significant.
\({-ts} > r > ts\)
Here is a matrix of our r values.
## Value Change Rate_Change
## Value 1.0000000 0.9394569 -0.5424324
## Change 0.9394569 1.0000000 -0.3909310
## Rate_Change -0.5424324 -0.3909310 1.0000000
This tells us that our all our correlations are statistically significant! Which means that we can use all of our factors as a predictor of the other.
I will be using the correlation between Italys’ Deaths rate of change to predict our Deaths per day values, as this is a statistically significant correlation, and the math lines up.
There are two things I want to point out about Covid-19:
Using these two facts, we can estimate the average time from point of infection to death is 23 days (5 incubation days + 18 days till death). This means that if someone dies today from Covid-19 we can estimate that they got infected 23 days prior.
sources:
https://www.uptodate.com/contents/coronavirus-disease-2019-covid-19?source=history_widget
https://www.thelancet.com/journals/lancet/article/PIIS0140-6736(20)30566-3/fulltext
I want to first build a model of Italys’ predicted Deaths per day, this can further help us find the actual number of confirmed cases!
There are a few things I want to note:
So lets get started!
\({Deaths}={Infected}*{MortalityRate}\)
This is our formula to find the number of Deaths from any given population of infected individuals. We are focusing on the Deaths per day, as I want a per day model.
This means we need to take the derivative of our formula, with respect to time. This leads into a two options:
The Mortality Rate is a constant value for the population.
The mortality Rate is a value that also changes with time.
\(D=Deaths\)
\(I=Infected\)
\(R=MortalityRate\)
If we take the derivative with Mortality Rate as a variable we get the following:
\(\frac{dD}{dt}=\frac{dI}{dt}*\frac{dD}{dI}+\frac{dR}{dt}*\frac{dD}{dR}\)
\(\frac{dD}{dt}=\frac{dI}{dt}*R+\frac{dR}{dt}*I\)
In plain terms, the change in Deaths is equal to the change in infections, time the Mortality Rate, plus, the change in Mortality Rate, times the infections.
There is one major problem with this, we cannot make an accurate estimate on the Mortality Rate, which means we also cannot make an accurate estimate on the change in Mortality Rate. So, while this would be the better option, I do not believe it is feasable with the data we have now, as we cannot get an accurate Mortality Rate.
However, taking the derivative with Mortality Rate as a constant gives us the following:
which also tells us..
In plain terms, we get that the change in Deaths is equal to the change in the infections, times the Mortality Rate. We also know that the change in infections is equal to the change in Deaths, divided by the Mortality Rate.
This means that with a constant Mortality Rate, the change in Deaths will also be the same as the change in Infections! As the rate is just scaling the numbers. This tells us we can make predictions on the change in Infections by using the change in Deaths, scaling with the Mortality Rate afterwards!
Using this information, I will build my model with Mortality Rate as a constant.
\(f(t)=P_0*(1+r)^t\)
We are calculating this one day at a time, which means our t value will be 1, giving us this:
FIXING FIXING FIXING FIXING
\(f(1)=P_0*(1+r)\)
\(\frac{f(1)}{P_0}=1+r\)
\(r=\frac{f(1)}{P_0}-1\)
\(r=\frac{\Delta f(1)}{P_0}\)
\(s.t.f(0)=P_0\)
\(r=\frac{f(1)-f(0)}{f(0)}\)
\(r_1-r_0=\frac{f(1)-f(0)}{P_0}\)
\(r_1=\frac{f(1)-f(0)}{P_0}+r_0\)
\(f(1)=(r_0*P_0)-(r_1*P_0)+P_0._\square\)
FIXING FIXING FIXING FIXING
In plain terms, the day to predict is equal to the change in our rate, times the previous day, plus the previous day. This means, if we try to predict the change in our rate, we can find our predicted Cumulative Deaths!
To start, there are a few things to note about how I built this algorithm:
Lets dive in by taking a look at how our rates of change work!
To start, we know how much our Deaths are changing per day, we can formulate what percentage this change is from our cumulative Deaths. To do this, we need the following formula:
\(r=\frac{\Delta f(t)}{P_0}\)
In plain terms, this formula takes the change per day of our deaths, and divides that by our previous days cumulative Deaths! Making the deaths per day into a percentage of our cumulative deaths!
split_to_country <- function(data, country){
#take data and split by country
#first function for model
country_data <- data %>%
subset(Country == country) %>% #getting italy
convert() %>% #function converts to wide format
subset(select = c(Country, Date, Deaths, Confirmed)) %>%
mutate( #adding columns for per day change, and rate per day
Deaths_Per_Day = Deaths - lag(Deaths, k=1),
Deaths_Rate_Change = (Deaths - lag(Deaths, k=1))/lag(Deaths, k=1),
Mortality_Rate = Deaths/Confirmed
)
#cleaning data for further analysis
country_data <- country_data %>%
subset(select = c(Date, Deaths, Deaths_Per_Day, Deaths_Rate_Change)) %>%
subset(!is.na(Deaths_Per_Day)) %>%
mutate(
Deaths_PD = Deaths_Per_Day,
Deaths_RC = Deaths_Rate_Change
) %>%
subset(select = c(Date, Deaths, Deaths_PD, Deaths_RC))
return (country_data)
}
italy <- raw %>%
split_to_country("Italy")
#showing tail values
tail(italy,5)## Date Deaths Deaths_PD Deaths_RC
## 27 2020-03-19 3405 427 0.1433848
## 28 2020-03-20 4032 627 0.1841410
## 29 2020-03-21 4825 793 0.1966766
## 30 2020-03-22 5476 651 0.1349223
## 31 2020-03-23 6077 601 0.1097516
Now that this step is complete, we can calculate the change in our death rate of change. Essentially looking at how much our rates are fluctiating over time. We can do this simply by taking our first deaths rate of change and subtracting the second days rate of change from that. This will mean if the rate goes up, the change will go up as well!
get_dr_c <- function(data){
#Calulating the change in our rates of change
data <- data %>%
mutate(
D_RC_C = Deaths_RC - lag(Deaths_RC,k=1)
)
return (data)
}
italy <- italy %>%
get_dr_c()
tail(italy,5)## Date Deaths Deaths_PD Deaths_RC D_RC_C
## 27 2020-03-19 3405 427 0.1433848 -0.04638745
## 28 2020-03-20 4032 627 0.1841410 0.04075615
## 29 2020-03-21 4825 793 0.1966766 0.01253562
## 30 2020-03-22 5476 651 0.1349223 -0.06175431
## 31 2020-03-23 6077 601 0.1097516 -0.02517064
Great, here are Italys’ cumulative Deaths, Deaths per day, Deaths rate of change, and the change in the rate of change. We will use all this information to work backwards at the end, to fill in the predictions.
##save og data!
og <- italy
split_for_graph <- function(data, split){
data <- data %>%
subset(!is.na(D_RC_C))
#taking the bottom 8 samples
data <- data[ceiling(nrow(data)-(split-1)) : nrow(data),]
mu = data %>%
subset(select = Deaths_PD) %>%
as.matrix() %>%
mean()
sd = data %>%
subset(select = Deaths_PD) %>%
as.matrix() %>%
sd()
#getting rid of outliers in the deaths per day
data <- data %>%
subset(mu-sd*3 < Deaths_PD & Deaths_PD < mu+sd*3)
mu <- data %>%
subset(select = D_RC_C) %>%
as.matrix() %>%
mean()
sd <- data %>%
subset(select = D_RC_C) %>%
as.matrix() %>%
sd()
#getting rid of outliers in the change of the rate of change in deaths
data <- data %>%
subset(mu-sd*3 < D_RC_C & D_RC_C < mu+sd*3)
return (data)
}
italy <- italy %>%
split_for_graph(8)hchart(density(italy$D_RC_C), type = "area", color = "red", name = "Rate of Deaths") %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Distribion of the Change in Rate of Change",
align = "left") %>%
hc_subtitle(text = "For Italy",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}")) %>%
hc_xAxis(labels = list(format = "{value}"),
title = list(text = "Change in Rate of Change of Deaths"))%>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(){
return ('Rate of Change: ' + this.y)}"))This is the distribution of the rate of change, we want to take a look at this to make sure it is relatively normal, otherwise there will be a different method needed to make predictions from.
##
## Shapiro-Wilk normality test
##
## data: italy$D_RC_C
## W = 0.89634, p-value = 0.2677
Exactly what we wanted to see, our p value is > 0.05, meaning our distribution is normal enough to find the probabilities!
Now that we have the changes per day, we can build the prediction model. I will break this down into steps:
Use a changing mean and standard deviation:
Generate a random number for each day, per trial
Calculate backwards to get the Cumulative Deaths
### functions used inside function below
get_rc <- function(death_rc_n1, change){
#rc = lag death_rc + change
rc=death_rc_n1+change
return(rc)
}
get_pd <- function(deaths_n1, deaths_rc){
#lag death*roc = pd
pd = (deaths_rc*deaths_n1)
if (pd <0){
pd = pd * -1
}
return(ceiling(pd))
}
get_death <- function(lag_death = lag_death, dpd){
#lag death + death per dat = next cumulative death
if(dpd <0){
dpd <- dpd * -1
}
death = lag_death+dpd
return(ceiling(death))
}
### function used for prediction model
prediction_model <- function(data, days, trials, split, curr_d){
curr_d = max(data[data$D_RC_C != 0,"Date"])
for (n in 1:trials){
temp <- data %>%
subset(select = c(Date, D_RC_C))
for (i in 1:days){
mu <- temp %>% #grabbing mean
subset(select = D_RC_C) %>%
slice(nrow(temp)-(nrow(temp)-split):nrow(temp)+i) %>%
as.matrix() %>%
mean()
sd <- temp %>% #grabbing standard deviation
subset(select = D_RC_C) %>%
slice(nrow(temp)-(nrow(temp)-split):nrow(temp)+i) %>%
as.matrix() %>%
sd()
set.seed(i*13*n)
temp <- temp %>%
bind_rows(data.frame(Date = as.Date(curr_d, format = "%Y-%m-%d")+i, D_RC_C = rnorm(1, mu, sd))) %>%
mutate(
Trial = n
)
}
if (n == 1){
trial_data <- temp
}
else{
trial_data <- temp %>%
bind_rows(trial_data)
}
}
return (trial_data)
}
curr_d <- max(italy[italy$D_RC_C != 0,"Date"])
TRIALS <- 20
DAYS <- 10
SPLIT <- 8
italy <- italy %>%
prediction_model(days = DAYS, trials = TRIALS, split = SPLIT, curr_d) %>%
group_by(Trial, Date) %>%
slice(n()) %>%
ungroup() %>%
mutate(
Deaths = NA,
Deaths_PD = NA,
Deaths_RC = NA,
Country = "Italy"
)calc_deaths <- function(data, trials, curr_d){
for (i in 1:trials){
for (d in data$Date){
if (d > as.Date(curr_d, format = "%Y-%m-%d")){
#getting roc
data[data$Date == d & data$Trial == i,6] <- get_rc(data[data$Date == d-1 & data$Trial == i,6], data[data$Date == d & data$Trial == i,2])
#getting deaths per day
data[data$Date == d & data$Trial == i,5] <- get_pd(data[data$Date == d-1 & data$Trial == i,4], data[data$Date == d & data$Trial == i,6])
#getting cumulative deaths
data[data$Date == d & data$Trial == i,4] <- get_death(data[data$Date == d-1 & data$Trial == i,4], data[data$Date == d & data$Trial == i,5])
}
}
}
return (data)
}
italy <- italy %>%
subset(Date >= as.Date(curr_d, format = "%Y-%m-%d"))
for (d in og$Date){
italy[italy$Date == d,"Deaths"] <- og[og$Date == d,"Deaths"]
italy[italy$Date == d,"Deaths_PD"] <- og[og$Date == d,"Deaths_PD"]
italy[italy$Date == d,"Deaths_RC"] <- og[og$Date == d,"Deaths_RC"]
}
italy <- italy %>%
calc_deaths(TRIALS, curr_d)hc <- highchart()
pn <- italy %>%
subset(Date >= curr_d)
for (i in 1:TRIALS){
hc <- hc %>%
hc_add_series(name = paste("SImulation", as.character(i), sep=" "), data = pn[pn$Trial == i,c("Deaths", "Date", "Trial")], type = "line", hcaes(x=Date, y=Deaths), showInLegend = F)
}
hc <- hc %>%
hc_xAxis(type = "datetime", dateTimeLabelFormats = list(day = '%d/%b')) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' projected Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 20, #days = 10, using last 8 days mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Cumulative Number of Deaths")) %>%
hc_xAxis(title = list(text = "Date"),
plotBands = list(
list(
label = list(text = "This is the Predicted Data"),
color = "lightblue",
from = datetime_to_timestamp(as.Date(curr_d)+1),
to = datetime_to_timestamp(as.Date('2020-06-02'))
)))%>%
hc_legend(align = "left") %>%
hc_add_series(name = "Actual Data", data = og, type = "line", hcaes(x=Date, y=Deaths), color = "black") %>%
hc_tooltip(formatter = JS("function(){
return ('Date: ' + this.point.Date + ' <br> Number of Deaths: ' + this.y + ' <br> Simulation: ' + this.point.Trial)}"))
hcAs shown, these are the 20 simulations for a 10 day forecast. Lets take a deeper look.
hc <- italy %>%
subset(Date > curr_d) %>%
hchart(type = "scatter", hcaes(x=Date,y=Deaths, color = Trial),
name = "SImulation") %>%
hc_tooltip(formatter = JS("function(){
return ('Date: ' + this.point.Date + ' <br> Number of Deaths: ' + this.y + ' <br> Simulation: ' + this.point.Trial)}"))
hc <- hc %>%
hc_xAxis(type = "datetime", dateTimeLabelFormats = list(day = '%d/%b')) %>%
hc_add_series(name = "Actual Data", data = og[og$Date==curr_d,], type = "scatter", hcaes(x=Date, y=Deaths), color = "black") %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' projected Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 20, #days = 10, using last 8 days mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Cumulative Number of Deaths")) %>%
hc_xAxis(title = list(text = "Date"),
plotBands = list(
list(
label = list(text = "This is the Predicted Data"),
color = "lightblue",
from = datetime_to_timestamp(as.Date(curr_d)+1),
to = datetime_to_timestamp(as.Date('2020-06-02'))
)))%>%
hc_legend(align = "left")
hcHere we can see the clustering of our Simulations. It seems the farther the forecast, the less clustered it becomes. This means that the Variance is growing as we predict the days, something that is expected as this is using a Markov Technique.
Lets take a look at the distribution of these indivdual days!
tplotd <- italy %>%
subset(Date == curr_d + 1)
hc <- hchart(density(tplotd$Deaths), type = "area", name = paste("Simulated Day #", as.character(1), sep=" "))
for (i in 2:DAYS){
tplotd <- italy %>%
subset(Date == curr_d + i)
hc <- hc %>%
hc_add_series(density(tplotd$Deaths), type = "area", name = paste("Simulated Day #", as.character(i), sep=" ")) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' Distribution of Cumulative Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 20, #days = 10, using last 8 days mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Dristribution")) %>%
hc_xAxis(title = list(text = "Cumulative Deaths")) %>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(i,curr_d){
return ('Number of Deaths: ' + this.x)}"))
}
hcThis shows what exactly is going on with each day that the algorithm predicts. As the number of predicted days increase, the distribution of our Cumulative Deaths flatten out. This means that there is more variation, or ‘less accuracy’, with each day that the algorithm predicts. This tells us that it will be more adventageous to use this algorithm as a short term predictor, rather than a long term predictor.
Now lets take a look our individual rates!
hc <- italy %>%
subset(Date > curr_d) %>%
hchart(type = "scatter", hcaes(x=Date,y=Deaths_RC, color = Trial),
name = "Simulation")
val <- og %>%
subset(Date == curr_d, select = Deaths_RC) %>%
as.numeric()
hc <- hc %>%
hc_xAxis(type = "datetime", dateTimeLabelFormats = list(day = '%d/%b')) %>%
hc_add_series(name = "Actual Data", data = og, type = "line", hcaes(x=Date, y=Deaths_RC), color = "black") %>%
hc_yAxis(title = list(text = "Rate of Change in Deaths"),
plotLines = list(list(
value = val,
color = '#ff0000',
width = 3,
zIndex = 4,
label = list(text = "Last Known Rate",
style = list( color = '#ff0000', fontWeight = 'bold' ))))) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' Rate of Change in Cumulative Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 20, #days = 10, using last 8 days mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Rate of Change from Previous Day")) %>%
hc_xAxis(title = list(text = "Date"),
plotBands = list(
list(
label = list(text = "This is the Predicted Data"),
color = "lightblue",
from = datetime_to_timestamp(as.Date(curr_d)+1),
to = datetime_to_timestamp(as.Date('2020-06-02'))
)))%>%
hc_legend(align = "left") %>%
hc_tooltip(formatter = JS("function(){
return ('Date: ' + this.point.Date + ' <br> Rate of Change: ' + this.y + ' <br> Simulation: ' + this.point.Trial)}"))
hcNow we can see that there is a clear split between rates that are going downwards and rates that are climbing upwards! This is great news, as it can generally imply that the Rate of Change will start to climb down, suggesting that what has been going on in the past 8 days is directly effecting the rate at which Covid-19 is spreading!
One thing that is possible with this algorithm, is changing the hyperparameters. I am going to import a dataset of this algorithm ran with more trials and different hyperparameters. I want to specifically limit the range to a 6 day forecast.
files <- list.files("predictions", full.names = TRUE)
pred_data <- files %>% #read in files
lapply(function(x){
read.csv(x) %>%
mutate(
Date = as.Date(Date, "%Y-%m-%d")
)
}) %>%
bind_rows()
plot_data <- pred_data %>%
subset(Split == c(4, 6, 8, 10, 30)) %>%
subset(Date > curr_d) %>%
mutate(
group = paste("Window:",as.character(Split),sep=" ")
)
hc <- hcboxplot(x = plot_data$Deaths_RC, var = plot_data$Date, var2 = plot_data$group,
outliers = FALSE) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Italys' Simulated Deaths Rate of Change",
align = "left") %>%
hc_subtitle(text = "#Simulations = 100, #days = 6, legend = #days for mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Rate of Change in Deaths, per day")) %>%
hc_xAxis(title = list(text = "Date"), label = list(day = '%d/%b'))%>%
hc_legend(align = "left")Here we can see that when we use a revolving mean and standard deviation, the results can vary vastly depending on the chosen window. In the above chart, the variation of the data seems to explode when anything over 10 is chosen. This is most likely due to the fact that the rates were vastly different in the beginning stages of the spread, as the overall numbers are lower. This suggests that we should be looking at a window that is less than 10, for better results.
There is one thing I want to point out, when observing the 30 day window, the median Rate of Change is the lowest one out of all other options, for all except the first day. This suggests, that although the variance between the trials is higher, the median is still converging to a lower number.
Lets take a look deeper look at the simulations with windows that are less than 10.
It seems that nearly every simulation showed a median change that is negative. This would highly indicate that the Change in Rates is slowing down, meaning that the Rate of Deaths is overall slowing down.
lets take a look at the mean Deaths for the Trials, and compare that with the actual data we now have for italy!
pred_mean <- pred_data %>%
subset(Date > curr_d) %>%
group_by(Split, Date) %>%
summarise(
mean_D = ceiling(mean(Deaths)),
mean_D_RC = mean(Deaths_RC),
mean_D_RC_C = mean(D_RC_C),
sd_D = sd(Deaths),
sd_D_RC = sd(Deaths_RC),
sd_D_RC_C = sd(D_RC_C)
)
files <- list.files("future_data", full.names = TRUE)
new_og <- files %>% #read in files
lapply(function(x){
read.csv(x) %>%
mutate(
Date = as.Date(Date, "%Y-%m-%d")
)
}) %>%
bind_rows()
plot_d <- pred_mean %>%
subset(Split < 10) %>%
mutate(
group = paste("Window:",as.character(Split),sep=" ")
)
plot_d <- new_og %>%
rename(
mean_D = Deaths,
mean_D_RC = Deaths_RC,
mean_D_RC_C = D_RC_C
) %>%
mutate(
group = "Actual Data"
) %>%
bind_rows(plot_d)
hc <- hchart(plot_d, "column", hcaes(x = Date, y = mean_D, group = group), color =c("red", "lightblue", "lightblue", "lightblue", "lightblue", "lightblue", "lightblue", "lightblue")) %>%
hc_add_theme(hc_theme_flat()) %>%
hc_title(text = "Comparison of Simulations Mean Cumulative Deaths",
align = "left") %>%
hc_subtitle(text = "#Simulations = 100, #days = 6, legend = #days for mu/sd",
align = "left") %>%
hc_yAxis(labels = list(format = "{value}"), title = list(text = "Mean Cumulative Deaths")) %>%
hc_xAxis(title = list(text = "Date"), label = list(day = '%d/%b'))%>%
hc_legend(align = "left")
hcThis chart shows the actual Cumulative deaths for March 24 - 29, as well as the Predicted values. This tells us a few things about the simulations: